{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 08 Interquartile range (IQR)"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"%%html\n",
""
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"from pandas import Series, DataFrame\n",
"import matplotlib.pyplot as plt\n",
"from scipy import stats"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"References:\n",
"\n",
"- [Khan Academy: calculating the interquartile range (IQR)](https://www.khanacademy.org/math/ap-statistics/summarizing-quantitative-data-ap/measuring-spread-quantitative/v/calculating-interquartile-range-iqr?modal=1)\n",
"- [pandas.DataFrame.quantile](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.quantile.html)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
""
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [],
"source": [
"x_data = {'x': [4,4,6,7,10,11,12,14,15]}"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [],
"source": [
"y_data = {'y': [7, 9, 9, 10, 10, 11, 12, 12, 14]}"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [],
"source": [
"# Wrap each raw sample in a single-column DataFrame.\n",
"x_df = pd.DataFrame(data=x_data)\n",
"y_df = pd.DataFrame(data=y_data)"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" x | \n",
"
\n",
" \n",
" \n",
" \n",
" count | \n",
" 9.000000 | \n",
"
\n",
" \n",
" mean | \n",
" 9.222222 | \n",
"
\n",
" \n",
" std | \n",
" 4.146618 | \n",
"
\n",
" \n",
" min | \n",
" 4.000000 | \n",
"
\n",
" \n",
" 25% | \n",
" 6.000000 | \n",
"
\n",
" \n",
" 50% | \n",
" 10.000000 | \n",
"
\n",
" \n",
" 75% | \n",
" 12.000000 | \n",
"
\n",
" \n",
" max | \n",
" 15.000000 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" x\n",
"count 9.000000\n",
"mean 9.222222\n",
"std 4.146618\n",
"min 4.000000\n",
"25% 6.000000\n",
"50% 10.000000\n",
"75% 12.000000\n",
"max 15.000000"
]
},
"execution_count": 29,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Summary statistics for x, with the quartile rows requested explicitly.\n",
"x_df.describe(percentiles=[0.25, 0.5, 0.75])"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" y | \n",
"
\n",
" \n",
" \n",
" \n",
" count | \n",
" 9.000000 | \n",
"
\n",
" \n",
" mean | \n",
" 10.444444 | \n",
"
\n",
" \n",
" std | \n",
" 2.068279 | \n",
"
\n",
" \n",
" min | \n",
" 7.000000 | \n",
"
\n",
" \n",
" 25% | \n",
" 9.000000 | \n",
"
\n",
" \n",
" 50% | \n",
" 10.000000 | \n",
"
\n",
" \n",
" 75% | \n",
" 12.000000 | \n",
"
\n",
" \n",
" max | \n",
" 14.000000 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" y\n",
"count 9.000000\n",
"mean 10.444444\n",
"std 2.068279\n",
"min 7.000000\n",
"25% 9.000000\n",
"50% 10.000000\n",
"75% 12.000000\n",
"max 14.000000"
]
},
"execution_count": 30,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Summary statistics for y, with the quartile rows requested explicitly.\n",
"y_df.describe(percentiles=[0.25, 0.5, 0.75])"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"x 12.0\n",
"Name: 0.75, dtype: float64\n",
"x 12\n",
"Name: 0.75, dtype: int64\n",
"y 12.0\n",
"Name: 0.75, dtype: float64\n",
"y 12\n",
"Name: 0.75, dtype: int64\n"
]
}
],
"source": [
"# Third quartile (Q3) of each sample: first with pandas' default\n",
"# interpolation, then snapped to the nearest observed value.\n",
"for frame in (x_df, y_df):\n",
"    print(frame.quantile(q=0.75))\n",
"    print(frame.quantile(q=0.75, interpolation='nearest'))"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"6.0\n",
"3.0\n"
]
}
],
"source": [
"# IQR (Q3 - Q1) of each sample via SciPy.\n",
"# Pass the data column itself: stats.iqr defaults to axis=None and would\n",
"# otherwise pool every column of the DataFrame into one flattened sample.\n",
"print(stats.iqr(x_df['x']))\n",
"print(stats.iqr(y_df['y']))"
]
},
{
"cell_type": "code",
"execution_count": 66,
"metadata": {},
"outputs": [],
"source": [
"# Quartiles of x from the empirical CDF: the smallest observation whose\n",
"# cumulative share of the data reaches the requested probability.\n",
"# The ranks live in a sorted copy, so x_df keeps only its data column and\n",
"# this cell gives the same answer when it is re-run.\n",
"x_ecdf = x_df[['x']].sort_values('x').reset_index(drop=True)\n",
"x_ecdf['Rank'] = x_ecdf.index + 1\n",
"x_ecdf['Empirical_CDF'] = x_ecdf['Rank'] / len(x_ecdf)\n",
"x_q_25 = x_ecdf.loc[x_ecdf['Empirical_CDF'] >= 0.25, 'x'].iloc[0]\n",
"x_q_50 = x_ecdf.loc[x_ecdf['Empirical_CDF'] >= 0.50, 'x'].iloc[0]\n",
"x_q_75 = x_ecdf.loc[x_ecdf['Empirical_CDF'] >= 0.75, 'x'].iloc[0]"
]
},
{
"cell_type": "code",
"execution_count": 67,
"metadata": {},
"outputs": [],
"source": [
"# Quartiles of y from the empirical CDF: the smallest observation whose\n",
"# cumulative share of the data reaches the requested probability.\n",
"# The ranks live in a sorted copy, so y_df keeps only its data column and\n",
"# this cell gives the same answer when it is re-run.\n",
"y_ecdf = y_df[['y']].sort_values('y').reset_index(drop=True)\n",
"y_ecdf['Rank'] = y_ecdf.index + 1\n",
"y_ecdf['Empirical_CDF'] = y_ecdf['Rank'] / len(y_ecdf)\n",
"y_q_25 = y_ecdf.loc[y_ecdf['Empirical_CDF'] >= 0.25, 'y'].iloc[0]\n",
"y_q_50 = y_ecdf.loc[y_ecdf['Empirical_CDF'] >= 0.50, 'y'].iloc[0]\n",
"y_q_75 = y_ecdf.loc[y_ecdf['Empirical_CDF'] >= 0.75, 'y'].iloc[0]"
]
},
{
"cell_type": "code",
"execution_count": 72,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"q 25 6 q 50 10 q 75 12 => iqr = 6\n",
"q 25 9 q 50 10 q 75 12 => iqr = 3\n"
]
}
],
"source": [
"# Report Q1, the median and Q3 for each sample, plus the IQR (Q3 - Q1).\n",
"for q_25, q_50, q_75 in ((x_q_25, x_q_50, x_q_75), (y_q_25, y_q_50, y_q_75)):\n",
"    print(f'q 25 {q_25} q 50 {q_50} q 75 {q_75} => iqr = {q_75 - q_25}')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.12"
}
},
"nbformat": 4,
"nbformat_minor": 4
}